Chapter 4 MAG catalogue

load("data/data.Rdata")

4.1 filter samples with high host data

sample_metadata <- sample_metadata%>%
  filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))

genome_counts_filt <- genome_counts %>% 
    select(one_of(c("genome",sample_metadata$sample)))%>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0))

genome_metadata <- genome_metadata %>% 
  semi_join(., genome_counts_filt, by = "genome") %>% 
  arrange(match(genome,genome_counts_filt$genome))

genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

#load("data/genome_gifts.Rdata")

4.2 Genome phylogeny

# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")

# Generate new species table
newspecies_table <- genome_metadata %>% 
  mutate(newspecies=ifelse(species=="s__","Y","N")) %>% 
  select(genome,newspecies) %>% 
  column_to_rownames(var = "genome")
  
# Generate wild table
wild_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(diet=="Wild") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate captive table
captive_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(diet=="Pre_grass") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate grass table
grass_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(diet=="Post_grass") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.2)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.05, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))

# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()

# Add completeness ring
circular_tree <- circular_tree +
        new_scale_fill() +
        scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
        geom_fruit(
                data=genome_metadata,
                geom=geom_bar,
                mapping = aes(x=completeness, y=genome, fill=contamination),
                offset = 0.3,
                pwidth = 0.1,
                orientation="y",
              stat="identity")

# Add genome-size ring
circular_tree <- circular_tree + new_scale_fill()

circular_tree <- gheatmap(circular_tree, wild_heatmap, offset=0.3, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#74C8AE")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

circular_tree <- gheatmap(circular_tree, captive_heatmap, offset=0.4, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#F4B02E")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

circular_tree <- gheatmap(circular_tree, grass_heatmap, offset=0.5, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#E6771D")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

circular_tree <- gheatmap(circular_tree, newspecies_table, offset=1.2, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#f4f4f4","#666666"))
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
List of 3
 $ legend.position: chr "none"
 $ plot.margin    : 'margin' num [1:4] 0points 0points 0points 0points
  ..- attr(*, "unit")= int 8
 $ panel.spacing  : 'margin' num [1:4] 0points 0points 0points 0points
  ..- attr(*, "unit")= int 8
 - attr(*, "class")= chr [1:2] "theme" "gg"
 - attr(*, "complete")= logi FALSE
 - attr(*, "validate")= logi TRUE
# Add text
circular_tree <-  circular_tree +
        annotate('text', x=2.9, y=0, label='              Phylum', family='arial', size=3.5) +
        annotate('text', x=3.3, y=0, label='            Group', family='arial', size=3.5) +
        annotate('text', x=3.8, y=0, label='                         Genome quality', family='arial', size=3.5) +
        annotate('text', x=4.2, y=0, label='                     New species', family='arial', size=3.5)

#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

4.3 Taxonomy overview

tax_mag <-genome_metadata %>% 
  group_by(phylum) %>%
  summarise(mag_n=n()) 

tax_mag %>%
  mutate(percetage_mag=round(mag_n*100/sum(mag_n), 2)) %>% 
  arrange(-percetage_mag) %>%
  tt()
phylum mag_n percetage_mag
p__Bacillota_A 526 61.16
p__Bacteroidota 127 14.77
p__Bacillota 53 6.16
p__Pseudomonadota 39 4.53
p__Verrucomicrobiota 35 4.07
p__Synergistota 18 2.09
p__Actinomycetota 15 1.74
p__Patescibacteria 13 1.51
p__Desulfobacterota 12 1.40
p__Bacillota_C 9 1.05
p__Cyanobacteriota 7 0.81
p__Bacillota_B 2 0.23
p__Spirochaetota 2 0.23
p__Campylobacterota 1 0.12
p__Methanobacteriota 1 0.12

4.4 Mag size (MB)

genome_metadata <-genome_metadata %>% 
  mutate(corrected_size=100*length/completeness)

Mags average size (MB)

genome_metadata %>% 
  summarise(Average_corrected_size=mean(corrected_size)) %>% 
  tt()
Average_corrected_size
2651482

Minimum Mags size (MB)

genome_metadata %>% 
  filter(corrected_size==min(corrected_size)) %>% 
  tt()
genome domain phylum class order family genus species completeness contamination length corrected_size
EHA03306_bin.234 d__Bacteria p__Patescibacteria c__Saccharimonadia o__Saccharimonadales f__Nanosyncoccaceae g__Nanosyncoccus s__ 52.57 7.13 326026 620175

Maximum Mags size (MB)

genome_metadata %>% 
  filter(corrected_size==max(corrected_size)) %>% 
  tt()
genome domain phylum class order family genus species completeness contamination length corrected_size
EHA04588_bin.47 d__Bacteria p__Bacillota_A c__Clostridia o__Oscillospirales f__Oscillospiraceae g__Lawsonibacter s__ 56.31 5.39 5576430 9903090

Mags arrange by size (MB)

genome_metadata %>% 
  arrange(corrected_size) %>% 
  paged_table()

Mags average size and completeness by phylum

genome_metadata %>% 
  group_by(phylum) %>%
  summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
  mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
         Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>% 
  arrange(-average_size) %>% 
  select(phylum, Size, Completeness) %>% 
  tt()
phylum Size Completeness
p__Bacteroidota 4150649.86±1672281.53 76.62±17.72
p__Pseudomonadota 2604751.8±835336.03 82.68±17.96
p__Verrucomicrobiota 2538115.51±808033.83 81.21±14.05
p__Bacillota_A 2534693.22±882541.15 81.5±16.81
p__Desulfobacterota 2207699.48±163896.46 85.49±13.65
p__Synergistota 2141226.18±176298.47 75.8±16.94
p__Spirochaetota 2053326.59±36521.71 94.71±0.12
p__Actinomycetota 1998831.26±620954.65 76.63±18.44
p__Bacillota_B 1932803.47±73181.05 90±2.28
p__Cyanobacteriota 1877859.37±192085.35 92.95±17.85
p__Methanobacteriota NA NA
p__Bacillota_C 1638316.23±205183.8 90.94±16.02
p__Bacillota 1551655.29±517109.39 83.87±12.72
p__Campylobacterota NA NA
p__Patescibacteria 1042754.15±629671.65 77.42±17.81

Mags average size and completeness by phylum in wild

wild_data <-sample_metadata %>% 
  filter(diet=="Wild")
  
  
wild_genome <- genome_counts %>% 
  select(genome, all_of(wild_data$sample)) %>%
  filter(rowSums(across(where(is.numeric)))!=0) 
  

wild_genome_metadata <- genome_metadata %>% 
  filter(genome %in% wild_genome$genome) %>% 
  arrange(match(genome,wild_genome$genome))

wild_genome_metadata %>% 
  group_by(phylum) %>%
  summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
  mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
         Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>% 
  arrange(-average_size) %>% 
  select(phylum, Size, Completeness) %>% 
  tt()
phylum Size Completeness
p__Bacteroidota 4150649.86±1672281.53 76.62±17.72
p__Pseudomonadota 2616167.41±843460.54 83.21±17.89
p__Verrucomicrobiota 2538115.51±808033.83 81.21±14.05
p__Bacillota_A 2537653.23±880765.5 81.48±16.82
p__Desulfobacterota 2207699.48±163896.46 85.49±13.65
p__Synergistota 2141226.18±176298.47 75.8±16.94
p__Spirochaetota 2053326.59±36521.71 94.71±0.12
p__Actinomycetota 1998831.26±620954.65 76.63±18.44
p__Bacillota_B 1932803.47±73181.05 90±2.28
p__Cyanobacteriota 1877859.37±192085.35 92.95±17.85
p__Bacillota_C 1638316.23±205183.8 90.94±16.02
p__Bacillota 1551655.29±517109.39 83.87±12.72
p__Campylobacterota NA NA
p__Patescibacteria 1042754.15±629671.65 77.42±17.81
wild_genome_metadata %>% 
  summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
  mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
         Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>% 
  arrange(-average_size) %>% 
  select(-Size, -Completeness) %>% 
  tt()
average_size sd_size average_comp sd_comp
2655093 1210519 80.9823 16.79731

Mags average size and completeness by phylum in captivity

pre_data <-sample_metadata %>% 
  filter(diet=="Pre_grass")
  
  
pre_genome <- genome_counts %>% 
  select(genome, all_of(pre_data$sample)) %>%
  filter(rowSums(across(where(is.numeric)))!=0) 
  

pre_genome_metadata <- genome_metadata %>% 
  filter(genome %in% pre_genome$genome) %>% 
  arrange(match(genome,pre_genome$genome))

pre_genome_metadata %>% 
  group_by(phylum) %>%
  summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
  mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
         Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>% 
  arrange(-average_size) %>% 
  select(phylum, Size, Completeness) %>% 
  tt()
phylum Size Completeness
p__Bacteroidota 4150649.86±1672281.53 76.62±17.72
p__Pseudomonadota 2605452.27±846537.48 82.87±18.16
p__Verrucomicrobiota 2538115.51±808033.83 81.21±14.05
p__Bacillota_A 2534693.22±882541.15 81.5±16.81
p__Desulfobacterota 2207699.48±163896.46 85.49±13.65
p__Synergistota 2141226.18±176298.47 75.8±16.94
p__Spirochaetota 2053326.59±36521.71 94.71±0.12
p__Actinomycetota 1998831.26±620954.65 76.63±18.44
p__Bacillota_B 1932803.47±73181.05 90±2.28
p__Cyanobacteriota 1877859.37±192085.35 92.95±17.85
p__Methanobacteriota NA NA
p__Bacillota_C 1638316.23±205183.8 90.94±16.02
p__Bacillota 1551655.29±517109.39 83.87±12.72
p__Campylobacterota NA NA
p__Patescibacteria 1048485.03±657316.38 76.17±17.99
pre_genome_metadata %>% 
  summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
  mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
         Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>% 
  arrange(-average_size) %>% 
  select(-Size, -Completeness)
# A tibble: 1 × 4
  average_size  sd_size average_comp sd_comp
         <dbl>    <dbl>        <dbl>   <dbl>
1     2653523. 1210343.         81.0    16.8

4.5 Genome quality

genome_metadata %>% 
    summarise(completeness_mean=mean(completeness) %>% round(2) %>% as.character(), 
              completeness_sd=sd(completeness) %>% round(2) %>% as.character(), 
              contamination_mean=mean(contamination) %>% round(2), 
              contamination_sd=sd(contamination) %>% round(2)) %>%
    unite("Completeness",completeness_mean, completeness_sd, sep = " ± ", remove = TRUE) %>%
    unite("Contamination",contamination_mean, contamination_sd, sep = " ± ", remove = TRUE) %>%
    tt()
Completeness Contamination
80.99 ± 16.79 2.21 ± 2.32
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  select(c(genome,domain,phylum,completeness,contamination,length)) %>%
  arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
  ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
              geom_point(alpha=0.7) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_void() +
                    theme(legend.position = "none",
                        axis.title.x = element_blank(),
                        axis.title.y = element_blank(),
                        axis.text.y=element_blank(),
                        axis.ticks.y=element_blank(),
                        axis.text.x=element_blank(),
                        axis.ticks.x=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(50,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_void() +
                theme(legend.position = "none",
                    axis.title.x = element_blank(),
                    axis.title.y = element_blank(),
                    axis.text.y=element_blank(),
                    axis.ticks.y=element_blank(),
                    axis.text.x=element_blank(),
                    axis.ticks.x=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

4.6 Functional overview

genome_annotations <- read_tsv("data/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

# Predicted genes
genome_annotations %>%
  nrow()
[1] 1362707
# Some annotation
genome_annotations %>%
  filter(if_any(c(kegg_id, pfam_hits, cazy_hits), ~ !is.na(.))) %>% 
  nrow()
[1] 1045555
# KEGG annotation
genome_annotations %>%
  filter(!is.na(kegg_id)) %>% 
  nrow()
[1] 610762
# Aggregate basal GIFT into elements
genome_gifts<-genome_gifts_raw
function_table <- genome_gifts %>%
    to.elements(., GIFT_db)

# Generate  basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
                ggtree(., size = 0.3) 
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
            scale_fill_manual(values=phylum_colors) +
            labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
            vexpand(.08) +
            coord_cartesian(clip = "off") +
            scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
            labs(fill="GIFT")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

# Add completeness barplots
function_tree <- function_tree +
            geom_fruit(data=genome_metadata,
            geom=geom_bar,
            grid.params=list(axis="x", text.size=2, nbreak = 1),
            axis.params=list(vline=TRUE),
            mapping = aes(x=length, y=genome, fill=completeness),
                 offset = 3.8,
                 orientation="y",
                 stat="identity") +
            scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
            labs(fill="Genome\ncompleteness")

function_tree

4.7 Functional ordination

# Generate the tSNE ordination
tSNE_function <- Rtsne(X=function_table, dims = 2, check_duplicates = FALSE)

# Plot the ordination
function_ordination <- tSNE_function$Y %>%
                as.data.frame() %>%
                mutate(genome=rownames(function_table)) %>%
                inner_join(genome_metadata, by="genome") %>%
                rename(tSNE1="V1", tSNE2="V2") %>%
                select(genome,phylum,tSNE1,tSNE2, length) %>%
                ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
                            geom_point(shape=16, alpha=0.7) +
                            scale_color_manual(values=phylum_colors) +
                            theme_minimal() +
                labs(color="Phylum", size="Genome size") +
                guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend

function_ordination